beers = read.csv('data/Beers.csv')
breweries = read.csv('data/Breweries.csv')

str(beers)
## 'data.frame':    2410 obs. of  7 variables:
##  $ Name      : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1638 577 1705 1842 1819 268 1160 758 1093 486 ...
##  $ Beer_ID   : int  1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
##  $ ABV       : num  0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
##  $ IBU       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brewery_id: int  409 178 178 178 178 178 178 178 178 178 ...
##  $ Style     : Factor w/ 100 levels "","Abbey Single Ale",..: 19 18 16 12 16 80 18 22 18 12 ...
##  $ Ounces    : num  12 12 12 12 12 12 12 12 12 12 ...
str(breweries)
## 'data.frame':    558 obs. of  4 variables:
##  $ Brew_ID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name   : Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 12 266 319 201 136 227 477 59 491 ...
##  $ City   : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 200 122 299 300 62 91 48 152 136 ...
##  $ State  : Factor w/ 51 levels " AK"," AL"," AR",..: 24 18 20 5 5 41 6 23 23 23 ...
# Both dataframes have column called 'Name'.
# In beers, it refers to the name of the beer
# In breweries, it refers to the name of the breweries.
# Let's rename for clarity, especially after merging.

names(beers)[names(beers) %in% 'Name'] <- 'Beer'
names(breweries)[names(breweries) %in% 'Name'] <- 'Brewery'

str(beers)
## 'data.frame':    2410 obs. of  7 variables:
##  $ Beer      : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1638 577 1705 1842 1819 268 1160 758 1093 486 ...
##  $ Beer_ID   : int  1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
##  $ ABV       : num  0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
##  $ IBU       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brewery_id: int  409 178 178 178 178 178 178 178 178 178 ...
##  $ Style     : Factor w/ 100 levels "","Abbey Single Ale",..: 19 18 16 12 16 80 18 22 18 12 ...
##  $ Ounces    : num  12 12 12 12 12 12 12 12 12 12 ...
str(breweries)
## 'data.frame':    558 obs. of  4 variables:
##  $ Brew_ID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Brewery: Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 12 266 319 201 136 227 477 59 491 ...
##  $ City   : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 200 122 299 300 62 91 48 152 136 ...
##  $ State  : Factor w/ 51 levels " AK"," AL"," AR",..: 24 18 20 5 5 41 6 23 23 23 ...

Question 1

How many breweries are present in each state?

# Option 1
# breweriesState = breweries %>%
#   group_by(State) %>%
#   summarise(Breweries = n()) %>%
#   ungroup()

breweriesState = breweries %>% count(State) # Option 2
DT::datatable(breweriesState,rownames = F)
str(breweriesState)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51 obs. of  2 variables:
##  $ State: Factor w/ 51 levels " AK"," AL"," AR",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ n    : int  7 3 2 11 39 47 8 1 2 15 ...
# https://rstudio.github.io/leaflet/map_widget.html
mapStates = map("state", fill = TRUE, plot = FALSE)
str(mapStates)
## List of 4
##  $ x    : num [1:15599] -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ y    : num [1:15599] 30.4 30.4 30.4 30.3 30.3 ...
##  $ range: num [1:4] -124.7 -67 25.1 49.4
##  $ names: chr [1:63] "alabama" "arizona" "arkansas" "california" ...
##  - attr(*, "class")= chr "map"
mapStates$names 
##  [1] "alabama"                         "arizona"                        
##  [3] "arkansas"                        "california"                     
##  [5] "colorado"                        "connecticut"                    
##  [7] "delaware"                        "district of columbia"           
##  [9] "florida"                         "georgia"                        
## [11] "idaho"                           "illinois"                       
## [13] "indiana"                         "iowa"                           
## [15] "kansas"                          "kentucky"                       
## [17] "louisiana"                       "maine"                          
## [19] "maryland"                        "massachusetts:martha's vineyard"
## [21] "massachusetts:main"              "massachusetts:nantucket"        
## [23] "michigan:north"                  "michigan:south"                 
## [25] "minnesota"                       "mississippi"                    
## [27] "missouri"                        "montana"                        
## [29] "nebraska"                        "nevada"                         
## [31] "new hampshire"                   "new jersey"                     
## [33] "new mexico"                      "new york:manhattan"             
## [35] "new york:main"                   "new york:staten island"         
## [37] "new york:long island"            "north carolina:knotts"          
## [39] "north carolina:main"             "north carolina:spit"            
## [41] "north dakota"                    "ohio"                           
## [43] "oklahoma"                        "oregon"                         
## [45] "pennsylvania"                    "rhode island"                   
## [47] "south carolina"                  "south dakota"                   
## [49] "tennessee"                       "texas"                          
## [51] "utah"                            "vermont"                        
## [53] "virginia:chesapeake"             "virginia:chincoteague"          
## [55] "virginia:main"                   "washington:san juan island"     
## [57] "washington:lopez island"         "washington:orcas island"        
## [59] "washington:whidbey island"       "washington:main"                
## [61] "west virginia"                   "wisconsin"                      
## [63] "wyoming"
# This will not work becasue of the way states are defined; would need major rework
# example state names are "Arizona", but we have "AZ" in out dataframe. 
# Also, some states are named "michigan:north", "michigan:south", etc.

# Anyway adding some examples here in case we want to explore this further
leaflet(data = mapStates) %>% addTiles() %>%
  addPolygons(fillColor = topo.colors(10, alpha = NULL), stroke = FALSE)
m = leaflet() %>% addTiles()
df = data.frame(
  lat = rnorm(100),
  lng = rnorm(100),
  size = runif(100, 5, 20),
  color = sample(colors(), 100)
)
m = leaflet(df) %>% addTiles()
m %>% addCircleMarkers(radius = ~size, color = ~color, fill = FALSE)
## Assuming "lng" and "lat" are longitude and latitude, respectively
m %>% addCircleMarkers(radius = runif(100, 4, 10), color = c('red'))
## Assuming "lng" and "lat" are longitude and latitude, respectively
states <- geojsonio::geojson_read("http://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_outline_20m.json", what = "sp")
class(states)
## [1] "SpatialLinesDataFrame"
## attr(,"package")
## [1] "sp"
names(states)
## [1] "TYPE"      "R_STATEFP" "L_STATEFP"
#head(states)

# Not sure yet if I need to get a special token for this.
# Below code is only plotting the boundary of US, not the states
# We can also try with other JSON files. They are available in the same link path as above.
m <- leaflet(states) %>%
  setView(-96, 37.8, 4) %>%
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) 

m %>% addPolygons()
# https://plot.ly/r/choropleth-maps/
df <- read.csv("https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv")
df$hover <- with(df, paste(state, '<br>', "Beef", beef, "Dairy", dairy, "<br>",
                           "Fruits", total.fruits, "Veggies", total.veggies,
                           "<br>", "Wheat", wheat, "Corn", corn))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

p <- plot_geo(df, locationmode = 'USA-states') %>%
  add_trace(
    z = ~total.exports, text = ~hover, locations = ~code,
    color = ~total.exports, colors = 'Purples'
  ) %>%
  colorbar(title = "Millions USD") %>%
  layout(
    title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
    geo = g
  )
## Warning: package 'bindrcpp' was built under R version 3.4.4
p

Question 2

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file

data = merge(beers,breweries,by.x='Brewery_id',by.y='Brew_ID',all=T) 

h4('First 6 Observations')

First 6 Observations

DT::datatable(head(data,6),rownames = F)
h4('Last 6 Observations')

Last 6 Observations

DT::datatable(tail(data,6),rownames = F)

Question 3

Report the number of NA’s in each column

countNA = sapply(data,function(x){sum(is.na(x))})
kable(countNA,col.names='Count of NA')
Count of NA
Brewery_id 0
Beer 0
Beer_ID 0
ABV 62
IBU 1005
Style 0
Ounces 0
Brewery 0
City 0
State 0

Question 4

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare

summary = data %>%
  group_by(State) %>%
  summarise(ABVMedian = median(ABV,na.rm=T)
            ,IBUMedian = median(IBU,na.rm=T)
            ) %>%
  ungroup() 

summary(summary)
##      State      ABVMedian         IBUMedian    
##   AK    : 1   Min.   :0.04000   Min.   :19.00  
##   AL    : 1   1st Qu.:0.05500   1st Qu.:30.00  
##   AR    : 1   Median :0.05600   Median :35.00  
##   AZ    : 1   Mean   :0.05585   Mean   :36.98  
##   CA    : 1   3rd Qu.:0.05800   3rd Qu.:42.75  
##   CO    : 1   Max.   :0.06250   Max.   :61.00  
##  (Other):45                     NA's   :1
# Question: Why do we need ungroup()? The code below works just fine without ungroup()
summary = data %>%
  group_by(State) %>%
  summarise(ABVMedian = median(ABV,na.rm=T) 
            ,IBUMedian = median(IBU,na.rm=T)
            ) 
  
summary(summary)
##      State      ABVMedian         IBUMedian    
##   AK    : 1   Min.   :0.04000   Min.   :19.00  
##   AL    : 1   1st Qu.:0.05500   1st Qu.:30.00  
##   AR    : 1   Median :0.05600   Median :35.00  
##   AZ    : 1   Mean   :0.05585   Mean   :36.98  
##   CA    : 1   3rd Qu.:0.05800   3rd Qu.:42.75  
##   CO    : 1   Max.   :0.06250   Max.   :61.00  
##  (Other):45                     NA's   :1
# There is an NA value. Which one is it?
summary[rowSums(is.na(summary)) > 0,]
## # A tibble: 1 x 3
##   State ABVMedian IBUMedian
##   <fct>     <dbl>     <dbl>
## 1 " SD"      0.06        NA
# SD has NA values for IBUMedian; need to remove before plotting
# Why does SD have NA values even though we removed NA values when we calculated median?
# Reason: na.rm will remove NA values before applying median function. However, if a state has all NA values 
# for a field (for example SD has all NA value for IBU), the summarize will add a NA value for that State

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
data$State <- trim(data$State)
data[data$State == 'SD', ]
##      Brewery_id                      Beer Beer_ID   ABV IBU
## 1237        213 Red Water Irish Style Red    2145 0.065  NA
## 1238        213                 Mjöllnir    1804 0.066  NA
## 1239        213  Bear Butte Nut Brown Ale    1602 0.055  NA
## 1240        213    Easy Livin' Summer Ale    1301 0.045  NA
## 1241        213          Canyon Cream Ale     542 0.055  NA
## 1242        213        Pile O'Dirt Porter     272 0.069  NA
## 1243        213             11th Hour IPA     271 0.060  NA
##                         Style Ounces                   Brewery      City
## 1237 American Amber / Red Ale     12 Crow Peak Brewing Company Spearfish
## 1238     Herbed / Spiced Beer     12 Crow Peak Brewing Company Spearfish
## 1239       American Brown Ale     12 Crow Peak Brewing Company Spearfish
## 1240      American Blonde Ale     12 Crow Peak Brewing Company Spearfish
## 1241                Cream Ale     12 Crow Peak Brewing Company Spearfish
## 1242          American Porter     12 Crow Peak Brewing Company Spearfish
## 1243             American IPA     12 Crow Peak Brewing Company Spearfish
##      State
## 1237    SD
## 1238    SD
## 1239    SD
## 1240    SD
## 1241    SD
## 1242    SD
## 1243    SD
ggplot(data=filter(summary,!is.na(ABVMedian))
       ,aes(x=fct_reorder(State,ABVMedian,desc=T)
                        ,y=ABVMedian)) +
  geom_col() +
  xlab("State") +
  ylab("Median Alcohol Content") +
  scale_y_continuous(labels=percent)+
  coord_flip()

ggplot(data=filter(summary,!is.na(IBUMedian))
       ,aes(x=fct_reorder(State,IBUMedian,desc=T)
                        ,y=IBUMedian)) +
  geom_col() +
  xlab("State") +
  ylab("Median International Bitterness Unit") +
  coord_flip()

Question 5

Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

message("The State with the maximum alcoholic (ABV) beer is:"
        ,arrange(data,desc(ABV))$State[1]
        )
## The State with the maximum alcoholic (ABV) beer is:CO
message("The State with the most bitter (IBU) beer is:"
        ,arrange(data,desc(IBU))$State[1]
        )
## The State with the most bitter (IBU) beer is:OR

Question 6

Summary statistics for the ABV variable:

summary(data$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800      62

Question 7

Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.

ggplot(data=data,aes(x=IBU,y=ABV))+
  geom_point() +
  geom_smooth(method='lm') +
  scale_y_continuous(labels=percent)

model=lm(data=data,formula=ABV~IBU)
model
## 
## Call:
## lm(formula = ABV ~ IBU, data = data)
## 
## Coefficients:
## (Intercept)          IBU  
##   0.0449303    0.0003508

We have a very slight positive correlation between ABV and IBU, with a coefficient of 0.0351 of increase % Alcohol per IBU point